{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c0684271-1a9e-45b5-aa24-314bf8a93d2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "findspark.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ac5cbc6c-ed8c-4cc0-a47e-b040de34203e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "import pyspark.sql.functions as F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8b64cc91-7568-45a7-8ae3-c19ba61df0c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = (\n",
    "    SparkSession\n",
    "    .builder\n",
    "    .master(\"local[*]\")\n",
    "    .appName(\"spark_json_testing\")\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e6bd33da-c647-4bb0-8166-23b7d04246e6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "json_path = 'DataAnalysisWithPythonAndPySpark-Data-trunk\\shows\\shows-silicon-valley.json'\n",
    "shows = spark.read.json(json_path)\n",
    "shows.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5ebf3552-1c92-4e0e-957d-479fcc5788ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['_embedded',\n",
       " '_links',\n",
       " 'externals',\n",
       " 'genres',\n",
       " 'id',\n",
       " 'image',\n",
       " 'language',\n",
       " 'name',\n",
       " 'network',\n",
       " 'officialSite',\n",
       " 'premiered',\n",
       " 'rating',\n",
       " 'runtime',\n",
       " 'schedule',\n",
       " 'status',\n",
       " 'summary',\n",
       " 'type',\n",
       " 'updated',\n",
       " 'url',\n",
       " 'webChannel',\n",
       " 'weight']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shows.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6b3a9cb2-9653-4b4f-91f6-7c585b4b61cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _embedded: struct (nullable = true)\n",
      " |    |-- episodes: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- _links: struct (nullable = true)\n",
      " |    |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |    |-- airdate: string (nullable = true)\n",
      " |    |    |    |-- airstamp: string (nullable = true)\n",
      " |    |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |-- number: long (nullable = true)\n",
      " |    |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |    |-- season: long (nullable = true)\n",
      " |    |    |    |-- summary: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |-- _links: struct (nullable = true)\n",
      " |    |-- previousepisode: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |    |-- self: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |-- externals: struct (nullable = true)\n",
      " |    |-- imdb: string (nullable = true)\n",
      " |    |-- thetvdb: long (nullable = true)\n",
      " |    |-- tvrage: long (nullable = true)\n",
      " |-- genres: array (nullable = true)\n",
      " |    |-- element: string (containsNull = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- image: struct (nullable = true)\n",
      " |    |-- medium: string (nullable = true)\n",
      " |    |-- original: string (nullable = true)\n",
      " |-- language: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- network: struct (nullable = true)\n",
      " |    |-- country: struct (nullable = true)\n",
      " |    |    |-- code: string (nullable = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |    |    |-- timezone: string (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- name: string (nullable = true)\n",
      " |-- officialSite: string (nullable = true)\n",
      " |-- premiered: string (nullable = true)\n",
      " |-- rating: struct (nullable = true)\n",
      " |    |-- average: double (nullable = true)\n",
      " |-- runtime: long (nullable = true)\n",
      " |-- schedule: struct (nullable = true)\n",
      " |    |-- days: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- time: string (nullable = true)\n",
      " |-- status: string (nullable = true)\n",
      " |-- summary: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      " |-- updated: long (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      " |-- webChannel: string (nullable = true)\n",
      " |-- weight: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f8bc78e9-8650-4427-91a1-6fa8cd7c0ace",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+--------+\n",
      "|name          |genres  |\n",
      "+--------------+--------+\n",
      "|Silicon Valley|[Comedy]|\n",
      "+--------------+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "array_subset = shows.select(\"name\",\"genres\")\n",
    "array_subset.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a3a6a687-a0ed-4ce5-a7c7-21c1eb247c7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+-------------+\n",
      "|          name|col_and_index|\n",
      "+--------------+-------------+\n",
      "|Silicon Valley|       Comedy|\n",
      "+--------------+-------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "array_subset = array_subset.select(\"name\",F.col(\"genres\")[0].alias('col_and_index'))\n",
    "array_subset.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bef36bad-4672-4d59-a414-79d884ee9889",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+----------------------+----------------------------------------+\n",
      "|name          |some_genres           |repeated_array                          |\n",
      "+--------------+----------------------+----------------------------------------+\n",
      "|Silicon Valley|[Comdy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|\n",
      "+--------------+----------------------+----------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "array_subset_repeated = array_subset.select(\"name\",\n",
    "                                           F.lit(\"Comdy\").alias(\"one\"),\n",
    "                                           F.lit(\"Horror\").alias(\"two\"),\n",
    "                                           F.lit(\"Drama\").alias(\"three\"),\n",
    "                                           F.col(\"col_and_index\")\n",
    "                                           ).select(\"name\",\n",
    "                                                    F.array(\"one\",\"two\",\"three\").alias(\"some_genres\"),\n",
    "                                                    F.array_repeat(\"col_and_index\",5).alias(\"repeated_array\")\n",
    "                                                   )\n",
    "array_subset_repeated.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9214dda8-759a-4c1b-a7ee-4ad0626d1795",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+-----------------+--------------------+\n",
      "|          name|size(some_genres)|size(repeated_array)|\n",
      "+--------------+-----------------+--------------------+\n",
      "|Silicon Valley|                3|                   5|\n",
      "+--------------+-----------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "array_subset_repeated.select(\"name\",\n",
    "                   F.size(\"some_genres\"),\n",
    "                   F.size(\"repeated_array\")).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "59d6eae5-4b8b-4b2e-a400-8295d9579d22",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+---------------------------+------------------------------+\n",
      "|name          |array_distinct(some_genres)|array_distinct(repeated_array)|\n",
      "+--------------+---------------------------+------------------------------+\n",
      "|Silicon Valley|[Comdy, Horror, Drama]     |[Comedy]                      |\n",
      "+--------------+---------------------------+------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "array_subset_repeated.select(\"name\",\n",
    "                   F.array_distinct(\"some_genres\"),\n",
    "                   F.array_distinct(\"repeated_array\")).show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "18b2697b-3360-4a08-b98f-dc54d9dd703b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+--------+--------+-----------------------------------+\n",
      "|name          |language|type    |value                              |\n",
      "+--------------+--------+--------+-----------------------------------+\n",
      "|Silicon Valley|English |Scripted|[Silicon Valley, English, Scripted]|\n",
      "+--------------+--------+--------+-----------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "columns = [\"name\",\"language\",\"type\"]\n",
    "shows.select(*columns,F.array(*columns).alias(\"value\")).show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5b9f1ff3-ac0d-471d-9242-7e76149ac84b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------+----+-----------------------------------+\n",
      "|name|language|type|values                             |\n",
      "+----+--------+----+-----------------------------------+\n",
      "|name|language|type|[Silicon Valley, English, Scripted]|\n",
      "+----+--------+----+-----------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_map = shows.select(*[F.lit(col) for col in columns],F.array(*columns).alias(\"values\"))\n",
    "shows_map.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0fa1aba5-9a10-475e-811b-c7ba33f683b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------+-----------------------------------+\n",
      "|keys                  |values                             |\n",
      "+----------------------+-----------------------------------+\n",
      "|[name, language, type]|[Silicon Valley, English, Scripted]|\n",
      "+----------------------+-----------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_map = shows_map.select(F.array(*columns).alias(\"keys\"),\"values\")\n",
    "shows_map.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d90b38a0-fdbb-404e-b4fd-90518f57043e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------------------------------------------------+\n",
      "|mapped                                                         |\n",
      "+---------------------------------------------------------------+\n",
      "|{name -> Silicon Valley, language -> English, type -> Scripted}|\n",
      "+---------------------------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_map = shows_map.select(F.map_from_arrays(\"keys\",\"values\").alias(\"mapped\"))\n",
    "shows_map.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "71db6929-94d2-4df5-a884-af27b765cb23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------------------------+\n",
      "|map_values(mapped)                 |\n",
      "+-----------------------------------+\n",
      "|[Silicon Valley, English, Scripted]|\n",
      "+-----------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_map.select(F.map_values(\"mapped\")).show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "63c59953-3648-4d14-b94d-5fe2f0c231bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _embedded: struct (nullable = true)\n",
      " |    |-- episodes: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- _links: struct (nullable = true)\n",
      " |    |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |    |-- airdate: string (nullable = true)\n",
      " |    |    |    |-- airstamp: string (nullable = true)\n",
      " |    |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |-- number: long (nullable = true)\n",
      " |    |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |    |-- season: long (nullable = true)\n",
      " |    |    |    |-- summary: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |-- _links: struct (nullable = true)\n",
      " |    |-- previousepisode: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |    |-- self: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |-- externals: struct (nullable = true)\n",
      " |    |-- imdb: string (nullable = true)\n",
      " |    |-- thetvdb: long (nullable = true)\n",
      " |    |-- tvrage: long (nullable = true)\n",
      " |-- genres: array (nullable = true)\n",
      " |    |-- element: string (containsNull = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- image: struct (nullable = true)\n",
      " |    |-- medium: string (nullable = true)\n",
      " |    |-- original: string (nullable = true)\n",
      " |-- language: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- network: struct (nullable = true)\n",
      " |    |-- country: struct (nullable = true)\n",
      " |    |    |-- code: string (nullable = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |    |    |-- timezone: string (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- name: string (nullable = true)\n",
      " |-- officialSite: string (nullable = true)\n",
      " |-- premiered: string (nullable = true)\n",
      " |-- rating: struct (nullable = true)\n",
      " |    |-- average: double (nullable = true)\n",
      " |-- runtime: long (nullable = true)\n",
      " |-- schedule: struct (nullable = true)\n",
      " |    |-- days: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- time: string (nullable = true)\n",
      " |-- status: string (nullable = true)\n",
      " |-- summary: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      " |-- updated: long (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      " |-- webChannel: string (nullable = true)\n",
      " |-- weight: long (nullable = true)\n",
      " |-- episodes: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- _links: struct (nullable = true)\n",
      " |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |-- airdate: string (nullable = true)\n",
      " |    |    |-- airstamp: string (nullable = true)\n",
      " |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |-- id: long (nullable = true)\n",
      " |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |    |    |-- number: long (nullable = true)\n",
      " |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |-- season: long (nullable = true)\n",
      " |    |    |-- summary: string (nullable = true)\n",
      " |    |    |-- url: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_clean = shows.withColumn(\"episodes\",F.col(\"_embedded.episodes\"))\n",
    "shows_clean.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "dd504b2b-7798-41cb-ad7f-7d70db0b924c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|                name|\n",
      "+--------------------+\n",
      "|[Minimum Viable P...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_clean = shows_clean.select(F.col(\"episodes.name\"))\n",
    "episodes_clean.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "091bc8cd-5421-404f-9803-55603db3d362",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------+\n",
      "|name                     |\n",
      "+-------------------------+\n",
      "|Minimum Viable Product   |\n",
      "|The Cap Table            |\n",
      "|Articles of Incorporation|\n",
      "|Fiduciary Duties         |\n",
      "|Signaling Risk           |\n",
      "+-------------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_clean.select(F.explode(F.col(\"name\")).alias(\"name\")).show(5,False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "310afe8c-bbf5-49ca-aa1d-7dd322ebafbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark.sql.types as T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ac3991f4-bd73-4fea-a921-d65a00e31b0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|             summary|\n",
      "+--------------------+\n",
      "|<p>In the high-te...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "schema = T.StructType([T.StructField(\"summary\",T.StringType())])\n",
    "spark.read.schema(schema).format(\"json\").load(json_path).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "8288b278-8e2c-4909-884f-d49a53c5cb76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _embedded: struct (nullable = true)\n",
      " |    |-- episodes: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- links: struct (nullable = true)\n",
      " |    |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |    |-- airdate: date (nullable = true)\n",
      " |    |    |    |-- airstamp: timestamp (nullable = true)\n",
      " |    |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |-- number: long (nullable = true)\n",
      " |    |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |    |-- season: long (nullable = true)\n",
      " |    |    |    |-- summary: string (nullable = true)\n",
      " |    |    |    |-- url: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_link_schema = T.StructType([\n",
    "   T.StructField(\"self\",T.StructType([T.StructField(\"href\",T.StringType())]))\n",
    "])\n",
    "\n",
    "episodes_image_schema = T.StructType([T.StructField(\"medium\",T.StringType()),\n",
    "                                      T.StructField(\"original\",T.StringType())])\n",
    "\n",
    "episodes_schema = T.StructType([T.StructField(\"links\",episodes_link_schema),\n",
    "                                T.StructField(\"airdate\",T.DateType()),\n",
    "                                T.StructField(\"airstamp\",T.TimestampType()),\n",
    "                                T.StructField(\"airtime\",T.StringType()),\n",
    "                                T.StructField(\"id\",T.LongType()),\n",
    "                                T.StructField(\"image\",episodes_image_schema),\n",
    "                                T.StructField(\"name\",T.StringType()),\n",
    "                                T.StructField(\"number\",T.LongType()),\n",
    "                                T.StructField(\"runtime\",T.LongType()),\n",
    "                                T.StructField(\"season\",T.LongType()),\n",
    "                                T.StructField(\"summary\",T.StringType()),\n",
    "                                T.StructField(\"url\",T.LongType())])\n",
    "schema = T.StructType([T.StructField(\"_embedded\",T.StructType([T.StructField(\"episodes\",T.ArrayType(episodes_schema))]))])\n",
    "\n",
    "shows_with_schema = spark.read.schema(schema).format(\"json\").load(json_path)\n",
    "shows_with_schema.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "0a5995a5-4fc7-4e9a-bc04-8f8b08095a09",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+\n",
      "|airtime|\n",
      "+-------+\n",
      "|  22:00|\n",
      "|  22:00|\n",
      "|  22:00|\n",
      "|  22:00|\n",
      "|  22:00|\n",
      "+-------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_with_schema.select(F.explode(\"_embedded.episodes\").alias(\"episode\")).select(\"episode.airtime\").show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "a6ba875d-80ff-4002-8c31-9cc381206423",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "|episode                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |\n",
      "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "|{NULL, 2014-04-06, 2014-04-07 10:00:00, 22:00, 10897, {http://static.tvmaze.com/uploads/images/medium_landscape/49/123633.jpg, http://static.tvmaze.com/uploads/images/original_untouched/49/123633.jpg}, Minimum Viable Product, 1, 30, 1, <p>Attending an elaborate launch party, Richard and his computer programmer friends - Big Head, Dinesh and Gilfoyle - dream of making it big. Instead, they're living in the communal Hacker Hostel owned by former programmer Erlich, who gets to claim ten percent of anything they invent there. When it becomes clear that Richard has developed a powerful compression algorithm for his website, Pied Piper, he finds himself courted by Gavin Belson, his egomaniacal corporate boss, who offers a $10 million buyout by his firm, Hooli. But Richard holds back when well-known investor Peter Gregory makes a counteroffer.</p>, NULL}|\n",
      "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
      "only showing top 1 row\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows_with_schema.select(F.explode(\"_embedded.episodes\").alias(\"episode\")).show(1,truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "643e6410-fe0a-43fb-908e-c5d8fea3cf8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'type': 'struct',\n",
       " 'fields': [{'name': 'episode',\n",
       "   'type': {'type': 'struct',\n",
       "    'fields': [{'name': 'links',\n",
       "      'type': {'type': 'struct',\n",
       "       'fields': [{'name': 'self',\n",
       "         'type': {'type': 'struct',\n",
       "          'fields': [{'name': 'href',\n",
       "            'type': 'string',\n",
       "            'nullable': True,\n",
       "            'metadata': {}}]},\n",
       "         'nullable': True,\n",
       "         'metadata': {}}]},\n",
       "      'nullable': True,\n",
       "      'metadata': {}},\n",
       "     {'name': 'airdate', 'type': 'date', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'airstamp',\n",
       "      'type': 'timestamp',\n",
       "      'nullable': True,\n",
       "      'metadata': {}},\n",
       "     {'name': 'airtime', 'type': 'string', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'id', 'type': 'long', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'image',\n",
       "      'type': {'type': 'struct',\n",
       "       'fields': [{'name': 'medium',\n",
       "         'type': 'string',\n",
       "         'nullable': True,\n",
       "         'metadata': {}},\n",
       "        {'name': 'original',\n",
       "         'type': 'string',\n",
       "         'nullable': True,\n",
       "         'metadata': {}}]},\n",
       "      'nullable': True,\n",
       "      'metadata': {}},\n",
       "     {'name': 'name', 'type': 'string', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'number', 'type': 'long', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'runtime', 'type': 'long', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'season', 'type': 'long', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'summary', 'type': 'string', 'nullable': True, 'metadata': {}},\n",
       "     {'name': 'url', 'type': 'long', 'nullable': True, 'metadata': {}}]},\n",
       "   'nullable': True,\n",
       "   'metadata': {}}]}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shows_with_schema.select(F.explode(\"_embedded.episodes\").alias(\"episode\")).schema.jsonValue()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "f031552b-1e99-42ca-bb53-ed287a805483",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'fields': [{'metadata': {},\n",
      "             'name': 'image',\n",
      "             'nullable': True,\n",
      "             'type': {'fields': [{'metadata': {},\n",
      "                                  'name': 'medium',\n",
      "                                  'nullable': True,\n",
      "                                  'type': 'string'},\n",
      "                                 {'metadata': {},\n",
      "                                  'name': 'original',\n",
      "                                  'nullable': True,\n",
      "                                  'type': 'string'}],\n",
      "                      'type': 'struct'}}],\n",
      " 'type': 'struct'}\n"
     ]
    }
   ],
   "source": [
    "import pprint\n",
    "\n",
    "pprint.pprint(shows_with_schema.\n",
    "              select(F.explode(\"_embedded.episodes\").alias(\"episode\"))\n",
    "              .select(\"episode.image\")\n",
    "              .schema.jsonValue())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "cce7e7d6-e3a3-491e-acbb-6534d12c4fc5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+\n",
      "| id|\n",
      "+---+\n",
      "|143|\n",
      "+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows.select(\"id\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "18db284d-a4f1-4418-9af2-c0f53906b0a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+----------------------------------------------------------------------+\n",
      "| id|                                                              episodes|\n",
      "+---+----------------------------------------------------------------------+\n",
      "|143|{{{http://api.tvmaze.com/episodes/10897}}, 2014-04-06, 2014-04-07T0...|\n",
      "|143|{{{http://api.tvmaze.com/episodes/10898}}, 2014-04-13, 2014-04-14T0...|\n",
      "|143|{{{http://api.tvmaze.com/episodes/10899}}, 2014-04-20, 2014-04-21T0...|\n",
      "|143|{{{http://api.tvmaze.com/episodes/10900}}, 2014-04-27, 2014-04-28T0...|\n",
      "|143|{{{http://api.tvmaze.com/episodes/10901}}, 2014-05-04, 2014-05-05T0...|\n",
      "+---+----------------------------------------------------------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes = shows.select(\"id\",F.explode(\"_embedded.episodes\").alias(\"episodes\"))\n",
    "episodes.show(5,70)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "2067a3eb-43b6-4063-9ab0-5ae55067ffd6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------------------------------------------------------------+\n",
      "|                                                                         name_id|\n",
      "+--------------------------------------------------------------------------------+\n",
      "|{10897 -> Minimum Viable Product, 10898 -> The Cap Table, 10899 -> Articles o...|\n",
      "+--------------------------------------------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episode_name_id = shows.select(\n",
    "    F.map_from_arrays(F.col(\"_embedded.episodes.id\"),F.col(\"_embedded.episodes.name\")).alias(\"name_id\")\n",
    ")\n",
    "episode_name_id.show(truncate=80)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ca1c22cb-305f-409c-a372-5eed7296ca02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+--------------------+\n",
      "|pos|   id|                name|\n",
      "+---+-----+--------------------+\n",
      "|  0|10897|Minimum Viable Pr...|\n",
      "|  1|10898|       The Cap Table|\n",
      "|  2|10899|Articles of Incor...|\n",
      "|  3|10900|    Fiduciary Duties|\n",
      "|  4|10901|      Signaling Risk|\n",
      "+---+-----+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episode_name_id.select(F.posexplode(\"name_id\").alias(\"pos\",\"id\",\"name\")).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "e932e48c-c03e-4574-8fac-538421a90a91",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+\n",
      "| id|            episodes|\n",
      "+---+--------------------+\n",
      "|143|[{{{http://api.tv...|\n",
      "+---+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "collected = episodes.groupby(\"id\").agg(\n",
    "    F.collect_list(\"episodes\").alias(\"episodes\")\n",
    ")\n",
    "collected.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "2ef3b669-2816-4692-a50a-670c9e893558",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: long (nullable = true)\n",
      " |-- episodes: array (nullable = false)\n",
      " |    |-- element: struct (containsNull = false)\n",
      " |    |    |-- _links: struct (nullable = true)\n",
      " |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |-- airdate: string (nullable = true)\n",
      " |    |    |-- airstamp: string (nullable = true)\n",
      " |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |-- id: long (nullable = true)\n",
      " |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |    |    |-- number: long (nullable = true)\n",
      " |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |-- season: long (nullable = true)\n",
      " |    |    |-- summary: string (nullable = true)\n",
      " |    |    |-- url: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "collected.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6f56500-73d7-40d4-9347-def7c1d5098d",
   "metadata": {},
   "source": [
    "- 常用的函数：\n",
    "    - F.lit:创建常量列\n",
    "    - F.array_repeat:创建重复元素的数组\n",
    "    - F.array_distinct:去重列表元素\n",
    "    - F.map_from_arrays:传入key_array和value_array生成映射\n",
    "- schema是StructType对象，里面用列表存放了StructFild对象，StructField里面有两个必选参数，分别是name和dataType\n",
    "- 复杂类型的数据可以通过列名加点去深钻属性\n",
    "- 通过 `import pyspark.sql.types as T`导入数据类型\n",
    "- StructType提供了两种返回json格式的方法：\n",
    "    - json\n",
    "    - jsonValue\n",
    "- StructType提供了将json字符串转换为schema的from_json方法\n",
    "- 从层次结构转换成表结构，用:\n",
    "    - F.explode\n",
    "    - F.posexplode\n",
    "- 合并多行数据用collect：\n",
    "    - collect_list\n",
    "    - collect_set "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33516dba-c19a-40c2-a30b-f7e38873f784",
   "metadata": {},
   "source": [
    "exercise 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "4ac789f3-9866-48ac-a59f-b2a83cad1e0b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---------+\n",
      "|one|      two|\n",
      "+---+---------+\n",
      "|  1|[1, 2, 3]|\n",
      "+---+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark.createDataFrame([{\"one\":1, \"two\":[1,2,3]}]).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "21941cbb-3dab-424c-a366-cdd98e07af3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "dict_schema = T.StructType([T.StructField(\"one\",T.IntegerType()),T.StructField(\"two\",T.ArrayType(T.IntegerType()))])\n",
    "temp = spark.createDataFrame([{\"one\":1, \"two\":[1,2,3]}],schema=dict_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7ab3062f-b5d9-464a-869f-b31aef706c8e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- one: integer (nullable = true)\n",
      " |-- two: array (nullable = true)\n",
      " |    |-- element: integer (containsNull = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "temp.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6e2aec1-696b-439d-a164-5d08f861687b",
   "metadata": {},
   "source": [
    "exercise 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "37f9a80e-917e-4409-9924-e5ed2e3e20b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "shows_folder_path = \"DataAnalysisWithPythonAndPySpark-Data-trunk\\shows\"\n",
    "schema = shows.schema\n",
    "three_shows = spark.read.json(os.path.join(shows_folder_path,\"shows-silicon-valley.json\"),schema=schema)\n",
    "json_path_list = [\"shows-breaking-bad.json\",\"shows-golden-girls.json\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "6380c465-a34b-449d-9381-33fbef93fe4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "for json_path in json_path_list:\n",
    "    three_shows = three_shows.union(spark.read.json(os.path.join(shows_folder_path,json_path),schema=schema))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "0bbeb52d-4b13-41f6-adb1-bfda68a88f4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "three_shows.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "d1664b99-aff7-4dab-9aae-fe4c92a4f85f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _embedded: struct (nullable = true)\n",
      " |    |-- episodes: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- _links: struct (nullable = true)\n",
      " |    |    |    |    |-- self: struct (nullable = true)\n",
      " |    |    |    |    |    |-- href: string (nullable = true)\n",
      " |    |    |    |-- airdate: string (nullable = true)\n",
      " |    |    |    |-- airstamp: string (nullable = true)\n",
      " |    |    |    |-- airtime: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- image: struct (nullable = true)\n",
      " |    |    |    |    |-- medium: string (nullable = true)\n",
      " |    |    |    |    |-- original: string (nullable = true)\n",
      " |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |-- number: long (nullable = true)\n",
      " |    |    |    |-- runtime: long (nullable = true)\n",
      " |    |    |    |-- season: long (nullable = true)\n",
      " |    |    |    |-- summary: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |-- _links: struct (nullable = true)\n",
      " |    |-- previousepisode: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |    |-- self: struct (nullable = true)\n",
      " |    |    |-- href: string (nullable = true)\n",
      " |-- externals: struct (nullable = true)\n",
      " |    |-- imdb: string (nullable = true)\n",
      " |    |-- thetvdb: long (nullable = true)\n",
      " |    |-- tvrage: long (nullable = true)\n",
      " |-- genres: array (nullable = true)\n",
      " |    |-- element: string (containsNull = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- image: struct (nullable = true)\n",
      " |    |-- medium: string (nullable = true)\n",
      " |    |-- original: string (nullable = true)\n",
      " |-- language: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- network: struct (nullable = true)\n",
      " |    |-- country: struct (nullable = true)\n",
      " |    |    |-- code: string (nullable = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |    |    |-- timezone: string (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- name: string (nullable = true)\n",
      " |-- officialSite: string (nullable = true)\n",
      " |-- premiered: string (nullable = true)\n",
      " |-- rating: struct (nullable = true)\n",
      " |    |-- average: double (nullable = true)\n",
      " |-- runtime: long (nullable = true)\n",
      " |-- schedule: struct (nullable = true)\n",
      " |    |-- days: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- time: string (nullable = true)\n",
      " |-- status: string (nullable = true)\n",
      " |-- summary: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      " |-- updated: long (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      " |-- webChannel: string (nullable = true)\n",
      " |-- weight: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "three_shows.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "bb943aae-e958-46b5-8b67-31ea603d5226",
   "metadata": {},
   "outputs": [],
   "source": [
    "episodes_shows= three_shows.select(\"id\",\"name\",F.explode(\"_embedded.episodes\").alias(\"episodes\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "fd075fd3-ac58-4e5e-bd6c-28ca4065dc69",
   "metadata": {},
   "outputs": [],
   "source": [
    "episodes_show = episodes_shows.select(\"id\",\"name\",F.col(\"episodes.id\").alias(\"eid\"),\"episodes.airdate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "e5769b89-051b-4e8a-a2ad-545aa5c5a2a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- eid: long (nullable = true)\n",
      " |-- airdate: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_show.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "0b770e20-fcc2-4101-b430-a7d1b96863b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+----------------+\n",
      "| id|            name|\n",
      "+---+----------------+\n",
      "|143|  Silicon Valley|\n",
      "|169|    Breaking Bad|\n",
      "|722|The Golden Girls|\n",
      "+---+----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_show.select(\"id\",\"name\").distinct().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "93ff2532-2600-4545-b41f-ed98356cf7f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- eid: long (nullable = true)\n",
      " |-- airdate: date (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_show = episodes_show.select(\"id\",\"name\",\"eid\",F.col(\"airdate\").cast(T.DateType()))\n",
    "episodes_show.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "43cff1dd-e799-48d5-8fd7-f515c923e976",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+----------------+----------+----------+-------------------+\n",
      "| id|            name|     begin|       end|           duration|\n",
      "+---+----------------+----------+----------+-------------------+\n",
      "|722|The Golden Girls|1985-09-14|1992-05-09|INTERVAL '2429' DAY|\n",
      "|169|    Breaking Bad|2008-01-20|2013-09-29|INTERVAL '2079' DAY|\n",
      "|143|  Silicon Valley|2014-04-06|2019-12-08|INTERVAL '2072' DAY|\n",
      "+---+----------------+----------+----------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "episodes_show.groupby(\"id\",\"name\").agg(F.min(\"airdate\").alias(\"begin\"),F.max(\"airdate\").alias(\"end\"),\n",
    "                                (F.max(\"airdate\")-F.min(\"airdate\")).alias(\"duration\")).orderBy(\"duration\",ascending=False).show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "11a37bed-032b-4c8e-9861-1239b5c6c789",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+----------+----------+-------------------+\n",
      "|            name|     begin|       end|           duration|\n",
      "+----------------+----------+----------+-------------------+\n",
      "|The Golden Girls|1985-09-14|1992-05-09|INTERVAL '2429' DAY|\n",
      "|    Breaking Bad|2008-01-20|2013-09-29|INTERVAL '2079' DAY|\n",
      "|  Silicon Valley|2014-04-06|2019-12-08|INTERVAL '2072' DAY|\n",
      "+----------------+----------+----------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "three_shows.select(\"name\",\n",
    "                   F.array_min(\"_embedded.episodes.airdate\").cast(\"date\").alias(\"begin\"),\n",
    "                   F.array_max(\"_embedded.episodes.airdate\").cast(\"date\").alias(\"end\")\n",
    "                  ).select(\"name\",\"begin\",\"end\",(F.col(\"end\")-F.col(\"begin\")).alias(\"duration\")).orderBy(\"duration\",ascending=False).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47f34dbd-cef6-42aa-a661-c66d8d993f12",
   "metadata": {},
   "source": [
    "结构体的数据通过点去访问，默认是存放在了数列中"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9f18b86-71ce-44a0-9097-a10cb802d983",
   "metadata": {},
   "source": [
    "exercise 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "127d7516-216d-4396-93ac-dfa63d73e292",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------------------+----------+\n",
      "|                         name|   airdate|\n",
      "+-----------------------------+----------+\n",
      "|       Minimum Viable Product|2014-04-06|\n",
      "|                The Cap Table|2014-04-13|\n",
      "|    Articles of Incorporation|2014-04-20|\n",
      "|             Fiduciary Duties|2014-04-27|\n",
      "|               Signaling Risk|2014-05-04|\n",
      "|       Third Party Insourcing|2014-05-11|\n",
      "|             Proof of Concept|2014-05-18|\n",
      "|Optimal Tip-to-Tip Efficiency|2014-06-01|\n",
      "|            Sand Hill Shuffle|2015-04-12|\n",
      "|          Runaway Devaluation|2015-04-19|\n",
      "|                    Bad Money|2015-04-26|\n",
      "|                     The Lady|2015-05-03|\n",
      "|                 Server Space|2015-05-10|\n",
      "|                     Homicide|2015-05-17|\n",
      "|                Adult Content|2015-05-24|\n",
      "|          White Hat/Black Hat|2015-05-31|\n",
      "|          Binding Arbitration|2015-06-07|\n",
      "|       Two Days of the Condor|2015-06-14|\n",
      "|             Founder Friendly|2016-04-24|\n",
      "|               Two in the Box|2016-05-01|\n",
      "+-----------------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "shows.select(F.map_from_arrays(\"_embedded.episodes.name\",\"_embedded.episodes.airdate\").alias(\"name_airdate\")\n",
    "            ).select(F.explode(\"name_airdate\").alias(\"name\",\"airdate\")).show(truncate=70)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10ce8bac-55e7-4373-9911-6409868bbd19",
   "metadata": {},
   "source": [
    "exercise 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "9434d978-1b7a-4f5c-9b12-b0bda7a95f4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "exo6_8 = spark.createDataFrame([[1,2],[2,4],[3,9]],[\"one\",\"square\"])\n",
    "# exo6_8 = exo6_8.select(F.col(\"one\").cast(T.StringType()),F.col(\"square\").cast(T.StringType())\n",
    "#              )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "c9d57697-0ff6-400b-bd51-986a9062fcbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "answer = exo6_8.select(F.map_from_arrays(F.collect_list(\"one\"),F.collect_list(\"square\")).alias(\"one2square\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "689614cf-9b96-4d83-8eda-a12160bd6f1e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------+\n",
      "|one2square              |\n",
      "+------------------------+\n",
      "|{1 -> 2, 2 -> 4, 3 -> 9}|\n",
      "+------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "answer.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "08cfb5ac-b74d-44ea-9058-11c7ab15ebc5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- one2square: map (nullable = false)\n",
      " |    |-- key: long\n",
      " |    |-- value: long (valueContainsNull = false)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "answer.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27ff6cb3-9ab6-46d7-8e79-d17e2e180bf8",
   "metadata": {},
   "source": [
    "json对象的键必须是字符串，但是映射和结构体不一定，别记错了"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "424d28a4-8f69-4220-bf6d-bf92602c187e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
